Load Theme for plots

theme_set(theme_fivethirtyeight())
theme_update(axis.title = element_text()) #the default for fivethirtyeight is to not show axis labels, this removes that default so we can choose to specify and display axis titles
theme_update(plot.title = element_text(hjust = 0.5)) # changing default to center all titles

Load Data downloaded from UCI and stored on github repo https://archive.ics.uci.edu/ml/datasets/Adult

adult = read.csv("/Users/Camo/Documents/SMU_DS/Applied Stats/Project2/Predicting_Income/adult.data", header = FALSE)

Description of variables from UCI:

Response: >50K, <=50K.

age: continuous. workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked. fnlwgt: continuous. education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool. education-num: continuous. marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse. occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces. relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black. sex: Female, Male. capital-gain: continuous. capital-loss: continuous. hours-per-week: continuous. native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

Add column names to data set:

# NOTE: names using underscore instead of hyphen so they can be referenced easier later
colnames(adult) <- c("age","workclass","fnlwgt","education","education_num","marital_status","occupation","relationship","race","sex","capital_gain","capital_loss","hours_per_week","native_country","income")

Investigate NA values to determine what needs resolution

aggr_plot <- aggr(adult, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(adult), cex.axis=.7, gap=3, ylab=c("Percent data missing","Combinations Missing"))

## 
##  Variables sorted by number of missings: 
##        Variable Count
##             age     0
##       workclass     0
##          fnlwgt     0
##       education     0
##   education_num     0
##  marital_status     0
##      occupation     0
##    relationship     0
##            race     0
##             sex     0
##    capital_gain     0
##    capital_loss     0
##  hours_per_week     0
##  native_country     0
##          income     0
#Note there are not missing values showing up initially but that's because the missing values are represented by "?" instead of NA

#Replace "?" with NA and re-do missing value analysis
adult[, 1:14][adult[, 1:14] == " ?"] <- NA

aggr_plot <- aggr(adult, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(adult), cex.axis=.7, gap=3, ylab=c("Percent data missing","Combinations Missing"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies

## 
##  Variables sorted by number of missings: 
##        Variable      Count
##      occupation 0.05660146
##       workclass 0.05638647
##  native_country 0.01790486
##             age 0.00000000
##          fnlwgt 0.00000000
##       education 0.00000000
##   education_num 0.00000000
##  marital_status 0.00000000
##    relationship 0.00000000
##            race 0.00000000
##             sex 0.00000000
##    capital_gain 0.00000000
##    capital_loss 0.00000000
##  hours_per_week 0.00000000
##          income 0.00000000
marginplot(adult[c(2,7)])

marginplot(adult[c(2,14)])

marginplot(adult[c(7,14)])

#occupation missing 5.66% of values
#workclass missing 5.64% of values
#native-country missing 1.79& of values
#Note that half of the missing workclass values occur on observations that are also missing occupation

Examine formats of data available

#Remove leading whitespace
adult[,c("workclass","education","marital_status","occupation","relationship","race","sex","native_country","income")]=as.data.frame(apply(adult[,c("workclass","education","marital_status","occupation","relationship","race","sex","native_country","income")],2,function(x)gsub('\\s+', '',x)))

#Convert character vars to factors and make list of vars
adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$occupation <- as.factor(adult$occupation)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)

categorical.explanatory = c("workclass","education","marital_status","occupation","relationship","race","sex","native_country")

str(adult)
## 'data.frame':    32561 obs. of  15 variables:
##  $ age           : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ workclass     : Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
##  $ fnlwgt        : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education     : Factor w/ 16 levels "10th","11th",..: 10 10 12 2 10 13 7 12 13 10 ...
##  $ education_num : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital_status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
##  $ occupation    : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
##  $ relationship  : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
##  $ race          : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
##  $ sex           : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
##  $ capital_gain  : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital_loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours_per_week: int  40 13 40 40 40 40 16 45 50 40 ...
##  $ native_country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
##  $ income        : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
summary(adult)
##       age                   workclass         fnlwgt       
##  Min.   :17.00   Private         :22696   Min.   :  12285  
##  1st Qu.:28.00   Self-emp-not-inc: 2541   1st Qu.: 117827  
##  Median :37.00   Local-gov       : 2093   Median : 178356  
##  Mean   :38.58   State-gov       : 1298   Mean   : 189778  
##  3rd Qu.:48.00   Self-emp-inc    : 1116   3rd Qu.: 237051  
##  Max.   :90.00   (Other)         :  981   Max.   :1484705  
##                  NA's            : 1836                    
##         education     education_num                 marital_status 
##  HS-grad     :10501   Min.   : 1.00   Divorced             : 4443  
##  Some-college: 7291   1st Qu.: 9.00   Married-AF-spouse    :   23  
##  Bachelors   : 5355   Median :10.00   Married-civ-spouse   :14976  
##  Masters     : 1723   Mean   :10.08   Married-spouse-absent:  418  
##  Assoc-voc   : 1382   3rd Qu.:12.00   Never-married        :10683  
##  11th        : 1175   Max.   :16.00   Separated            : 1025  
##  (Other)     : 5134                   Widowed              :  993  
##            occupation            relationship                   race      
##  Prof-specialty : 4140   Husband       :13193   Amer-Indian-Eskimo:  311  
##  Craft-repair   : 4099   Not-in-family : 8305   Asian-Pac-Islander: 1039  
##  Exec-managerial: 4066   Other-relative:  981   Black             : 3124  
##  Adm-clerical   : 3770   Own-child     : 5068   Other             :  271  
##  Sales          : 3650   Unmarried     : 3446   White             :27816  
##  (Other)        :10993   Wife          : 1568                             
##  NA's           : 1843                                                    
##      sex         capital_gain    capital_loss    hours_per_week 
##  Female:10771   Min.   :    0   Min.   :   0.0   Min.   : 1.00  
##  Male  :21790   1st Qu.:    0   1st Qu.:   0.0   1st Qu.:40.00  
##                 Median :    0   Median :   0.0   Median :40.00  
##                 Mean   : 1078   Mean   :  87.3   Mean   :40.44  
##                 3rd Qu.:    0   3rd Qu.:   0.0   3rd Qu.:45.00  
##                 Max.   :99999   Max.   :4356.0   Max.   :99.00  
##                                                                 
##        native_country    income     
##  United-States:29170   <=50K:24720  
##  Mexico       :  643   >50K : 7841  
##  Philippines  :  198                
##  Germany      :  137                
##  Canada       :  121                
##  (Other)      : 1709                
##  NA's         :  583

#This is where I started making changes after Rick’s contribution above

Notes: -I corrected white space issue before variable types were corrected -I added additional libraries at top

Remaining missing values visualized

#Proportion and total missing values
missing=data.frame(miss_var_cumsum(adult))
missing$prop_miss = missing$n_miss/dim(adult)[1]
missing[,-3] %>% arrange(-n_miss)
##          variable n_miss  prop_miss
## 1      occupation   1843 0.05660146
## 2       workclass   1836 0.05638647
## 3  native_country    583 0.01790486
## 4             age      0 0.00000000
## 5          fnlwgt      0 0.00000000
## 6       education      0 0.00000000
## 7   education_num      0 0.00000000
## 8  marital_status      0 0.00000000
## 9    relationship      0 0.00000000
## 10           race      0 0.00000000
## 11            sex      0 0.00000000
## 12   capital_gain      0 0.00000000
## 13   capital_loss      0 0.00000000
## 14 hours_per_week      0 0.00000000
## 15         income      0 0.00000000
#Chart of missing values by variable
gg_miss_var(adult)

Looking at categorical variable class balance

cat_vars=adult[,c("workclass","education","marital_status","occupation","relationship","race","sex","native_country","income")]


##Notes on Workclass
#Disproportionate NAs on <=50K
#Significant difference in self-emp-inc
#Small differences elsewhere
ggplot(cat_vars, aes(x= workclass,  group=income)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="workclass") +
    facet_grid(~income) +
    scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")

##Notes on Education
#Assoc-voc and Assoc-acdm only not sig diff
ggplot(cat_vars, aes(x= education,  group=income)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="education") +
    facet_grid(~income) +
    scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")

##Notes on Marital Status
#Very sig differences in Married-civ-spouse, never-married,and divorced
ggplot(cat_vars, aes(x= marital_status,  group=income)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="marital_status") +
    facet_grid(~income) +
    scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")

ggplot(cat_vars, aes(x= occupation,  group=income)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="occupation") +
    facet_grid(~income) +
    scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")

ggplot(cat_vars, aes(x= relationship,  group=income)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="relationship") +
    facet_grid(~income) +
    scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")

ggplot(cat_vars, aes(x= race,  group=income)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="race") +
    facet_grid(~income) +
    scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")

ggplot(cat_vars, aes(x= sex,  group=income)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="sex") +
    facet_grid(~income) +
    scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")

ggplot(cat_vars, aes(x= native_country,  group=income)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="native_country") +
    facet_grid(~income) +
    scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")

filt_cat_vars=filter(cat_vars, native_country != "United-States")
summary(filt_cat_vars$native_country)
##                   Cambodia                     Canada 
##                         19                        121 
##                      China                   Columbia 
##                         75                         59 
##                       Cuba         Dominican-Republic 
##                         95                         70 
##                    Ecuador                El-Salvador 
##                         28                        106 
##                    England                     France 
##                         90                         29 
##                    Germany                     Greece 
##                        137                         29 
##                  Guatemala                      Haiti 
##                         64                         44 
##         Holand-Netherlands                   Honduras 
##                          1                         13 
##                       Hong                    Hungary 
##                         20                         13 
##                      India                       Iran 
##                        100                         43 
##                    Ireland                      Italy 
##                         24                         73 
##                    Jamaica                      Japan 
##                         81                         62 
##                       Laos                     Mexico 
##                         18                        643 
##                  Nicaragua Outlying-US(Guam-USVI-etc) 
##                         34                         14 
##                       Peru                Philippines 
##                         31                        198 
##                     Poland                   Portugal 
##                         60                         37 
##                Puerto-Rico                   Scotland 
##                        114                         12 
##                      South                     Taiwan 
##                         80                         51 
##                   Thailand            Trinadad&Tobago 
##                         18                         19 
##              United-States                    Vietnam 
##                          0                         67 
##                 Yugoslavia 
##                         16
ggplot(filt_cat_vars, aes(x= native_country,  group=income)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="native_country") +
    facet_grid(~income) +
    scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")

ggplot(cat_vars, aes(x= income)) + geom_bar()

#75.919% of observations are <50K
dim(cat_vars[cat_vars$income=='<=50K',])[1]/
  (dim(cat_vars[cat_vars$income=='>50K',])[1]+dim(cat_vars[cat_vars$income=='<=50K',])[1])
## [1] 0.7591904

Looking at Numerical Variables

num_vars=adult %>% select_if(is.numeric)
num_vars$income=adult$income
num_vars$income=factor(num_vars$income)
variable.names(num_vars)
## [1] "age"            "fnlwgt"         "education_num"  "capital_gain"  
## [5] "capital_loss"   "hours_per_week" "income"
plot(num_vars[,1:6])

num_vars %>% ggplot(aes(x=income,y=age))+geom_boxplot()

#older people have higher income
num_vars %>% ggplot(aes(x=income,y=fnlwgt))+geom_boxplot()

#Education is significantly higher (we should think about if it will cause an issue to include both education variables)
num_vars %>% ggplot(aes(x=income,y=education_num))+geom_boxplot()

#Investigate probability if gains are more likely for higher income
num_vars %>% ggplot(aes(x=income,y=capital_gain))+geom_boxplot()

#Testing relation of gains and losses on income
test1=num_vars
test2=num_vars
test1$gains=ifelse(test1$capital_gain>0,'Gain','No Gain')
test2$losses=ifelse(test1$capital_loss>0,'Loss','No Loss')
test1$gains=factor(test1$gains)
test2$losses=factor(test2$losses)

ggplot(test1, aes(x= gains,  group=income)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="gains") +
    facet_grid(~income) +
    scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")

ggplot(test2, aes(x= losses,  group=income)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="gains") +
    facet_grid(~income) +
    scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")

#Investigate probability if losses are more likely for lower income
num_vars %>% ggplot(aes(x=income,y=capital_loss))+geom_boxplot()

num_vars %>% ggplot(aes(x=income,y=hours_per_week))+geom_boxplot()

#Generate a correlation and significance table and plot for all vars
res2 <- rcorr(as.matrix(num_vars[,1:6]),type="pearson")
corrplot(res2$r, type="upper", order="hclust", 
         p.mat = res2$P, sig.level =  0.1, insig = "blank")

#Test variables with Wilcox Test for significance
vars_num=variable.names(num_vars[,-7])
p_matrix= matrix(nrow=length(num_vars)-1, ncol=3)
for (i in 1:length(vars_num)){
a=wilcox.test(num_vars[,(i)]~num_vars$income,alternative="two.sided")
p_matrix[i,1]=vars_num[i]
p_matrix[i,2]=a[[3]]
p_matrix[i,3]=ifelse(a[[3]]<=.05,"keep","remove")
}

p_matrix
##      [,1]             [,2]                    [,3]    
## [1,] "age"            "0"                     "keep"  
## [2,] "fnlwgt"         "0.0526819376884781"    "remove"
## [3,] "education_num"  "0"                     "keep"  
## [4,] "capital_gain"   "0"                     "keep"  
## [5,] "capital_loss"   "7.02129357933078e-143" "keep"  
## [6,] "hours_per_week" "0"                     "keep"